In [1]:
import pandas as pd

def time_convert(x):
    """ Convert a string in hh:mm:ss format into a number of seconds """
    try:
        times = x.split(':')
        return 3600*int(times[0]) + 60*int(times[1]) + int(times[2])
    except Exception:
        return float('nan')

def ReadParseData(filename):
    # The race time must be converted to seconds so it can be compared in the regressions
    Cs = {'Official Time': time_convert, '5K': time_convert, 'M/F': lambda x: int(x == 'M')}
    # EQUIVALENT TO:
    #Cs = dict()  # or Cs = {}
    #Cs['Official Time'] = time_convert
    #Cs['5K'] = time_convert
    #Cs['M/F'] = lambda x: int(x == 'M')
    # See the documentation of "read_csv":
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    bm = pd.read_csv(filename, converters=Cs)
    # DROP THE COLUMNS THAT ARE NOT NEEDED
    #bm.drop(bm.columns[[0,1,4,5,8,9]], axis=1, inplace=True)
    #bm.drop(bm.columns[[2,3,4,5,6,7,8,9,10,11,12,13]], axis=1, inplace=True)
    # OR: decide which series to keep
    bm = bm[['Age', 'M/F', '5K', 'Official Time', 'Overall', 'Gender', 'Division']]
    # Remove rows without a valid 5K split (missing splits were converted to NaN)
    print('Number of records BEFORE preprocessing:', len(bm))
    bm = bm[bm['5K'] > 0]
    print('Number of records AFTER preprocessing:', len(bm))
    return bm

bm = ReadParseData('./data/marathon_results_2016.csv')
# PRINT THE FIRST 3 ROWS OF THE DATA FRAME
#bm[:3]
bm[27:36]
Out[1]:
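
A quick sanity check of the converter, added here as a small sketch (the values are illustrative, not part of the original notebook): a well-formed hh:mm:ss string becomes a number of seconds, anything else becomes NaN.

print(time_convert('2:03:15'))   # 2*3600 + 3*60 + 15 = 7395
print(time_convert('-'))         # any malformed entry -> nan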
In [2]:
import numpy as np
import matplotlib.pyplot as plt

def ScatterPlot(bm, Feature1, Feature2):
    sub = bm.copy()
    # Select the features to plot, split by gender
    ym = sub[sub['M/F'] == 1][Feature1]
    xm = sub[sub['M/F'] == 1][Feature2]
    yf = sub[sub['M/F'] == 0][Feature1]
    xf = sub[sub['M/F'] == 0][Feature2]
    # Draw the plot
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(xm, ym, alpha=0.2, c='blue')
    ax.scatter(xf, yf, alpha=0.2, c='red')
    ax.legend(('Male', 'Female'))
    plt.show()

ScatterPlot(bm, 'Official Time', 'Gender')
ScatterPlot(bm, 'Official Time', 'Age')
In [3]:
import numpy as np
import matplotlib.pyplot as plt

def FilterPlot(F1, F2, threshold):
    # Filter the dataframe
    sub = bm[bm.Gender < threshold]
    ym = sub[sub['M/F'] == 1][F1]
    xm = sub[sub['M/F'] == 1][F2]
    yf = sub[sub['M/F'] == 0][F1]
    xf = sub[sub['M/F'] == 0][F2]
    # Draw the plot
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(xm, ym, alpha=0.3, c='blue')
    ax.scatter(xf, yf, alpha=0.3, c='red')
    ax.legend(('Male', 'Female'))
    plt.show()

FilterPlot('Official Time', 'Age', 5000)
FilterPlot('M/F', 'Official Time', 5000)
In [4]:
import seaborn as sns

def PlotStrip(bm, threshold=1000):
    # Filter the dataframe
    sub = bm[bm.Gender < threshold]
    sns.stripplot(y='Official Time', x='M/F', data=sub, jitter=True)
    plt.show()

PlotStrip(bm)
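
The same male/female comparison can also be drawn as a boxplot; this is an optional sketch, not part of the original notebook, and PlotBox is a name introduced here for illustration.

def PlotBox(bm, threshold=1000):
    # Optional sketch: summarise the two distributions instead of plotting every point
    sub = bm[bm.Gender < threshold]
    sns.boxplot(y='Official Time', x='M/F', data=sub)
    plt.show()

PlotBox(bm)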
In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def GenerateTrainTestSet(bm, Fs, F2, threshold=200000):
    sub = bm[bm.Gender < threshold]
    x_train, x_test, y_train, y_test = train_test_split(sub[Fs], sub[F2], random_state=0)
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = GenerateTrainTestSet(bm, ['Official Time'], 'M/F')
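
If one wants the train and test sets to keep exactly the same male/female proportions, train_test_split accepts a stratify argument; the variant below is a sketch added here, and GenerateStratifiedSet is a hypothetical name.

def GenerateStratifiedSet(bm, Fs, F2, threshold=200000):
    sub = bm[bm.Gender < threshold]
    # Stratify on the target column so both splits have the same class balance
    return train_test_split(sub[Fs], sub[F2], random_state=0, stratify=sub[F2])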
In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def PrintEvaluation(y_test, y_pred):
    try:
        print('MAE:', mean_absolute_error(y_test, y_pred))
        print('MSE:', mean_squared_error(y_test, y_pred))
        print('R2:', r2_score(y_test, y_pred))
        print('ACCURACY:', accuracy_score(y_test, y_pred))
        print('REPORT:', classification_report(y_test, y_pred))
        print('CM:', confusion_matrix(y_test, y_pred))
    except Exception:
        print('Error while computing the statistics: debug your code')
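
To see what PrintEvaluation reports, here is a tiny hand-made example (the label vectors are made up purely for illustration): with 3 of 4 predictions correct the accuracy is 0.75.

y_true_demo = [0, 1, 1, 0]
y_pred_demo = [0, 1, 0, 0]
PrintEvaluation(y_true_demo, y_pred_demo)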
In [7]:
def PlotPredictions(x_test, y_test, y_pred):
    # Plot the test values
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(x_test, y_test, alpha=0.3, c='blue')
    # Plot the predicted values
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(x_test, y_pred, alpha=0.3, c='red')
    plt.show()
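
An alternative, shown here only as a sketch (PlotPredictionsOverlay is a name introduced for illustration), is to draw the test values and the predictions on a single pair of axes, which makes them easier to compare.

def PlotPredictionsOverlay(x_test, y_test, y_pred):
    # Test values and predictions on the same axes
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(x_test, y_test, alpha=0.3, c='blue', label='test')
    ax.scatter(x_test, y_pred, alpha=0.3, c='red', label='predicted')
    ax.legend()
    plt.show()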
In [8]:
from sklearn.linear_model import LinearRegression

def RunLinearRegression(x_train, x_test, y_train):
    lr = LinearRegression()
    # Input to this function must be "DataFrames"
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    # Turn the continuous regression output into 0/1 labels with a 0.5 threshold
    y_pred = [1 if p > 0.5 else 0 for p in y_pred]
    return y_pred

y_pred = RunLinearRegression(x_train, x_test, y_train)
PlotPredictions(x_test, y_test, y_pred)
PrintEvaluation(y_test, y_pred)
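
The 0.5 cut-off used above to turn the regression output into a 0/1 label is an arbitrary choice; the sketch below (added here, not in the original notebook) refits the model and compares a few thresholds.

lr = LinearRegression().fit(x_train, y_train)
raw = lr.predict(x_test)
for t in (0.4, 0.5, 0.6):
    labels = [1 if p > t else 0 for p in raw]
    print('threshold', t, 'accuracy', accuracy_score(y_test, labels))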
In [9]:
import seaborn as sns
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g')
plt.show()
In [10]:
from sklearn.linear_model import LogisticRegression

def RunLogisticRegression(x_train, x_test, y_train):
    logit = LogisticRegression(penalty='l2', class_weight='balanced')
    # Input to this function must be "DataFrames"
    logit.fit(x_train, y_train)
    y_pred = logit.predict_proba(x_test)
    print(y_pred[:3])
    # Keep the class with the higher predicted probability
    y_pred = [1 if p[0] < p[1] else 0 for p in y_pred]
    return y_pred

y_pred = RunLogisticRegression(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)
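
Comparing the two columns of predict_proba picks the class with the higher probability, which is what the classifier's predict method does; the equivalence can be checked with the short sketch below, added here for illustration.

logit = LogisticRegression(penalty='l2', class_weight='balanced').fit(x_train, y_train)
print(accuracy_score(y_test, logit.predict(x_test)))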
In [11]:
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g', logistic=True)
plt.show()
In [12]:
from sklearn import neighbors

def RunNeighborClassifier(x_train, x_test, y_train):
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    # Input to this function must be "DataFrames"
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    return y_pred

y_pred = RunNeighborClassifier(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)
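
The number of neighbours k is a free parameter of the model; the sketch below (added here, not part of the original notebook) compares a few values with 5-fold cross-validation on the training set.

from sklearn.model_selection import cross_val_score

for k in (3, 5, 11, 21):
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=5)
    print('k =', k, 'mean accuracy =', scores.mean())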